Analytics Model

Code
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import adjusted_rand_score
# Load the EDA dataset. The path is relative to the current working directory,
# so this raises FileNotFoundError when the notebook is rendered from a
# directory that has no data/eda_data.csv beneath it.
eda = pd.read_csv("data/eda_data.csv")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[1], line 8
      6 from sklearn.preprocessing import LabelEncoder
      7 from sklearn.metrics import adjusted_rand_score
----> 8 eda = pd.read_csv("data/eda_data.csv")

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    868 elif isinstance(handle, str):
    869     # Check whether the filename is to be opened in binary mode.
    870     # Binary mode does not support 'encoding' and 'newline'.
    871     if ioargs.encoding and "b" not in ioargs.mode:
    872         # Encoding
--> 873         handle = open(
    874             handle,
    875             ioargs.mode,
    876             encoding=ioargs.encoding,
    877             errors=errors,
    878             newline="",
    879         )
    880     else:
    881         # Binary mode
    882         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'data/eda_data.csv'
Code
# Load the EDA dataset (path is relative to the working directory).
eda = pd.read_csv("data/eda_data.csv")

# Clustering inputs: coerce every column to numeric (unparseable values
# become NaN) and keep only the rows that are complete in all three.
cluster_columns = ['SALARY', 'MAX_YEARS_EXPERIENCE', 'MIN_YEARS_EXPERIENCE']
features = eda[cluster_columns].apply(pd.to_numeric, errors='coerce').dropna()

# Standardize so no single column dominates the distance computation.
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Fit KMeans and write the labels back onto the rows that were clustered;
# rows dropped above receive no value in the 'Cluster' column.
kmeans = KMeans(n_clusters=4, random_state=688)
cluster_ids = kmeans.fit_predict(X)
clustered_rows = features.index
eda.loc[clustered_rows, 'Cluster'] = cluster_ids

# Compare the clusters against the SOC occupation names using the
# adjusted Rand index, restricted to the clustered rows.
true_labels = eda.loc[clustered_rows, 'SOC_2021_4_NAME']
label_encoder = LabelEncoder()
true_labels_encoded = label_encoder.fit_transform(true_labels)

ari = adjusted_rand_score(true_labels_encoded, eda.loc[clustered_rows, 'Cluster'])
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[2], line 1
----> 1 eda = pd.read_csv("data/eda_data.csv")
      3 features = eda[['SALARY', 'MAX_YEARS_EXPERIENCE', 'MIN_YEARS_EXPERIENCE']].copy()
      5 for col in ['MAX_YEARS_EXPERIENCE', 'MIN_YEARS_EXPERIENCE', 'SALARY']:

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
   1013 kwds_defaults = _refine_defaults_read(
   1014     dialect,
   1015     delimiter,
   (...)
   1022     dtype_backend=dtype_backend,
   1023 )
   1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
    617 _validate_names(kwds.get("names", None))
    619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
    622 if chunksize or iterator:
    623     return parser

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
   1617     self.options["has_index_names"] = kwds["has_index_names"]
   1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
   1878     if "b" not in mode:
   1879         mode += "b"
-> 1880 self.handles = get_handle(
   1881     f,
   1882     mode,
   1883     encoding=self.options.get("encoding", None),
   1884     compression=self.options.get("compression", None),
   1885     memory_map=self.options.get("memory_map", False),
   1886     is_text=is_text,
   1887     errors=self.options.get("encoding_errors", "strict"),
   1888     storage_options=self.options.get("storage_options", None),
   1889 )
   1890 assert self.handles is not None
   1891 f = self.handles.handle

File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    868 elif isinstance(handle, str):
    869     # Check whether the filename is to be opened in binary mode.
    870     # Binary mode does not support 'encoding' and 'newline'.
    871     if ioargs.encoding and "b" not in ioargs.mode:
    872         # Encoding
--> 873         handle = open(
    874             handle,
    875             ioargs.mode,
    876             encoding=ioargs.encoding,
    877             errors=errors,
    878             newline="",
    879         )
    880     else:
    881         # Binary mode
    882         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'data/eda_data.csv'
Code
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import HTML

# 1) Build the plotting DataFrame: the clustering features plus each row's label.
# NOTE(review): relies on `features`, `scaler` and `kmeans` still being the
# objects produced by the clustering cell - confirm cell execution order.
df_plot = features.copy()
# Cluster ids are categories, not magnitudes. Left as floats, Plotly Express
# would draw a continuous colour bar; string labels give one discrete colour
# and one legend entry per cluster.
df_plot['Cluster'] = eda.loc[features.index, 'Cluster'].astype(int).astype(str)

# 2) Compute centroids in original units.
# KMeans was fitted on the standardized matrix X, so cluster_centers_ are in
# standardized units. X itself has mean ~0 and std ~1 per column, so it cannot
# be used to undo the scaling; the fitted scaler holds the original statistics.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
centroids_x = centroids[:, features.columns.get_loc('SALARY')]
centroids_y = centroids[:, features.columns.get_loc('MAX_YEARS_EXPERIENCE')]

# 3) Create an interactive Plotly figure
fig = px.scatter(
    df_plot,
    x='SALARY',
    y='MAX_YEARS_EXPERIENCE',
    color='Cluster',
    # Fix the legend/colour order instead of using order of first appearance.
    category_orders={'Cluster': sorted(df_plot['Cluster'].unique())},
    title="KMeans Clustering by Salary and Max Years Experience",
    labels={
        'SALARY': 'Salary',
        'MAX_YEARS_EXPERIENCE': 'Max Years Experience',
        'Cluster': 'Cluster'
    },
    width=800,
    height=500,
)

# 4) Add centroid traces
fig.add_trace(
    go.Scatter(
        x=centroids_x,
        y=centroids_y,
        mode='markers',
        marker=dict(symbol='x', size=18, color='black', line=dict(width=2, color='white')),
        name='Centroids'
    )
)

# Save a standalone HTML copy (plotly.js loaded from the CDN), then display
# the figure as the cell output.
fig.write_html(
    "figures/analytics_plot1.html",
    include_plotlyjs="cdn",
    full_html=True
)
fig
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 6
      3 from IPython.display import HTML
      5 # 1) Build the DataFrame
----> 6 df_plot = features.copy()
      7 df_plot['Cluster'] = eda.loc[features.index, 'Cluster']
      9 # 2) Compute centroids in original units

NameError: name 'features' is not defined

The model yields four clusters. Cluster 0 (shown in green) has lower salaries, mostly under $150k, and a maximum experience of 2–5 years; these are likely junior to mid-level employees with moderate pay. Cluster 1 (orange) has medium to high salaries spanning a wide range ($100k–$500k) but a narrow experience range of about 3 years, which suggests specialized or high-paying roles with short experience — possibly fast-track promotions or high-demand fields. Cluster 2 has low salaries and 0–4 years of experience; these are clearly entry-level employees. Cluster 3 has medium salaries, mostly under $200k, and more experience (6–13 years); these are probably senior professionals with more experience but not the highest salaries.

Code
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objects as go

# Prepare features & target.
# Coerce the predictors AND the target to numeric and drop incomplete rows in
# one pass, so X and y stay row-aligned and no NaN or non-numeric salary
# reaches LinearRegression.fit (which rejects NaN targets).
regression_data = (
    eda[['MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'SALARY']]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
features = regression_data[['MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE']]
X = features
y = regression_data['SALARY']

# Train/test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=688)

# Fit model & predict on the held-out rows
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Metrics on the test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}, R²: {r2:.3f}")

# Define min/max of the actual salaries for the identity (ideal-fit) line
min_val = y_test.min()
max_val = y_test.max()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 8
      5 import plotly.graph_objects as go
      7 # Prepare features & target
----> 8 features = eda[['MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE']].apply(pd.to_numeric, errors='coerce')
      9 features = features.dropna()
     10 X = features

NameError: name 'eda' is not defined
Code
# Scatter of held-out actual salaries (x) against model predictions (y).
prediction_points = go.Scatter(
    x=y_test,
    y=y_pred,
    mode='markers',
    marker=dict(color='skyblue', opacity=0.6),
    name='Predicted vs Actual'
)

# Identity line: where a perfect model (predicted == actual) would put every point.
identity_line = go.Scatter(
    x=[min_val, max_val],
    y=[min_val, max_val],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Ideal Fit'
)

fig = go.Figure([prediction_points, identity_line])

# Titles, axis labels, size and theme.
layout_options = dict(
    title='Actual vs Predicted Salary (Multiple Regression)',
    xaxis_title='Actual Salary',
    yaxis_title='Predicted Salary',
    width=800,
    height=600,
    template='plotly_white'
)
fig.update_layout(**layout_options)

# Save as an HTML fragment (full_html=False) with plotly.js loaded from the
# CDN, then display the figure as the cell output.
fig.write_html(
    'figures/analytics_plot2.html',
    include_plotlyjs='cdn',
    full_html=False
)
fig
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 3
      1 fig = go.Figure([
      2     go.Scatter(
----> 3         x=y_test,
      4         y=y_pred,
      5         mode='markers',
      6         marker=dict(color='skyblue', opacity=0.6),
      7         name='Predicted vs Actual'
      8     ),
      9     go.Scatter(
     10         x=[min_val, max_val],
     11         y=[min_val, max_val],
     12         mode='lines',
     13         line=dict(color='red', dash='dash'),
     14         name='Ideal Fit'
     15     )
     16 ])
     18 fig.update_layout(
     19     title='Actual vs Predicted Salary (Multiple Regression)',
     20     xaxis_title='Actual Salary',
   (...)
     24     template='plotly_white'
     25 )
     28 fig.write_html(
     29     'figures/analytics_plot2.html',
     30     include_plotlyjs='cdn',
     31     full_html=False
     32 )

NameError: name 'y_test' is not defined

This plot shows actual vs. predicted salary from a multiple linear regression model. The blue dots are individual test-set predictions, and the red dashed line is the ideal line where predicted = actual. Since most points lie very close to the red line, the model predicts salary very accurately, with minimal error and a strong linear fit — likely reflected in a high R² score near 1.0. This reading should be confirmed against the MSE and R² values printed by the regression cell.